LightGBM Model Tuning¶

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.append("/Users/efim/PycharmProjects/")
sys.path.append("/Users/efim/PycharmProjects/SimpleAlgoTrade/model")
from SimpleAlgoTrade.model import FeatureEngineering as fe
from SimpleAlgoTrade.model.utils import ic_metric
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.inspection import PartialDependenceDisplay
from sklearn.metrics import mean_squared_error
from statsmodels.api import OLS, add_constant
import lightgbm as lgb
from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin
from hyperopt.pyll import scope
import hyperopt
import tqdm
from typing import Dict
from plotly import express as px
from plotly import graph_objects as go
from plotly import offline as pyo
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
matplotlib.rcParams['figure.figsize'] = (15, 10)
In [2]:
SEED = 123456789
np.random.seed(SEED)

Read Data¶

In [3]:
# Move into the raw-data directory; when the notebook is re-run the cwd is
# already there and the relative path fails, so that case is ignored.
try:
    os.chdir("../DataBase/files")
except FileNotFoundError:
    pass
# IPython shell escape: list the data files we are about to read.
!ls -a
.
..
.DS_Store
Archive
price_execution_data_list_16-08-2022-13-24-21.json
price_execution_data_list_16-08-2022-15-32-04.json
trade_book_data_16-08-2022-13-24-21.json
trade_book_data_16-08-2022-15-32-04.json
In [4]:
file_name = "price_execution_data_list"

def read_data_files(name_like: str, directory: str = ".") -> Dict[str, pd.DataFrame]:
    """Read every JSON file whose name contains `name_like`.

    Parameters
    ----------
    name_like : substring used to filter file names.
    directory : directory to scan; defaults to the current working directory,
        which keeps the original call `read_data_files(file_name)` working.

    Returns
    -------
    dict mapping file name -> DataFrame parsed with pd.read_json.
    """
    data_dict = {}
    # Local loop variable `fname` so we do not shadow the module-level
    # `file_name` defined in the cell above (the original loop did).
    for fname in os.listdir(directory):
        if name_like in fname:
            data_dict[fname] = pd.read_json(os.path.join(directory, fname))
    return data_dict

def aggregate_dict_to_dataframe(dictinary: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Stack the DataFrames of `dictinary` row-wise and index by "time".

    Parameter name kept as-is (typo and all) so keyword callers stay valid.

    Returns
    -------
    One DataFrame with "time" as index; raises KeyError if the frames have
    no "time" column (same failure mode as the original).
    """
    frames = list(dictinary.values())
    # Single concat instead of growing a DataFrame inside a loop, which is
    # quadratic in the total number of rows.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    return df.set_index("time")

# Load every matching JSON file and stack them into one frame indexed by time.
price_execution_data = read_data_files(file_name)
price_execution_data_df = aggregate_dict_to_dataframe(price_execution_data)


# Free the per-file dict; only the combined frame is needed from here on.
del price_execution_data

# Files may be read in arbitrary order, so make the time index monotonic.
price_execution_data_df = price_execution_data_df.sort_index()
In [5]:
price_execution_data_df.head()
Out[5]:
symbol price delta_time
time
1.660649e+09 BTCUSDT 24071.72 1.332418
1.660649e+09 BTCUSDT 24071.40 0.611654
1.660649e+09 BTCUSDT 24072.47 1.330175
1.660649e+09 BTCUSDT 24073.85 0.592218
1.660649e+09 BTCUSDT 24072.48 0.634877

Feature Engineering¶

In [6]:
time_lag = 13  # prediction horizon in ticks; with time_lag == 1 the target is the next tick's return
In [7]:
# Return of the single most recent tick. Defined unconditionally: with the
# original `if time_lag != 1:` guard, running the notebook with time_lag == 1
# left `feature_shift_return_most_recent` undefined while it is still
# referenced when the feature union is assembled below (NameError).
feature_shift_return_most_recent = make_pipeline(fe.FeatureSelector('price'), fe.CalcShift(1), fe.CalcReturn())

Shift Returns¶

In [8]:
# Lagged returns at 1..10 multiples of the prediction horizon.
feature_shift_returns = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcShift(i * time_lag),fe.CalcReturn()) for i in range(1,11)))
# feature_shift_returns

MA¶

In [9]:
# Returns of moving averages over 1..10 multiples of the horizon.
feature_ma = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcMa(i * time_lag), fe.CalcReturn()) for i in range(1,11)))
# feature_ma

BB Low¶

In [10]:
# Returns of the lower Bollinger band at each window size.
feature_bb_low = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcBB(i * time_lag,"low"), fe.CalcReturn()) for i in range(1,11)))
# feature_bb_low

BB High¶

In [11]:
# Returns of the upper Bollinger band at each window size.
feature_bb_high = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcBB(i * time_lag,"high"), fe.CalcReturn()) for i in range(1,11)))
# feature_bb_high

Quantile 0.01¶

In [12]:
# Rolling 1% quantile of returns over each window size.
feature_quantile_001 = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcReturn(), fe.CalcQuantile(i * time_lag,0.01)) for i in range(1,11)))
# feature_quantile_001

Quantile 0.99¶

In [13]:
# Rolling 99% quantile of returns over each window size.
feature_quantile_099 = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'), fe.CalcReturn(), fe.CalcQuantile(i * time_lag,0.99)) for i in range(1,11)))
# feature_quantile_099

Rolling Std¶

In [14]:
# Rolling standard deviation of returns over each window size.
feature_std= fe.make_union(*(make_pipeline(fe.FeatureSelector('price'),fe.CalcReturn(), fe.CalcStd(i * time_lag)) for i in range(1,11)))
# feature_std

RSI¶

In [15]:
# RSI of the price over each window size.
feature_rsi = fe.make_union(*(make_pipeline(fe.FeatureSelector('price'),fe.CalcRsi(i*time_lag)) for i in range(1,11)))
# feature_rsi

PPO¶

In [16]:
# Percentage price oscillator (single default-parameter pipeline).
feature_ppo = make_pipeline(fe.FeatureSelector('price'), fe.CalcPpo())
# feature_ppo

Macd¶

In [17]:
# MACD of the price (single default-parameter pipeline).
feature_macd = make_pipeline(fe.FeatureSelector('price'), fe.CalcMacd())
# feature_macd

Target¶

In [18]:
# Target: return realised `time_lag` ticks into the future (negative shift).
target = make_pipeline(fe.FeatureSelector('price'), fe.CalcShift(-time_lag),fe.CalcReturn())
# target

Aggregate all Features¶

In [19]:
# Union of all feature pipelines plus the target; the target is appended
# LAST so `data.columns[:-1]` below selects exactly the feature columns.
feature_union = fe.make_union(feature_shift_return_most_recent,
                              feature_shift_returns,
                              feature_ma,
                              feature_bb_low,
                              feature_bb_high,
                              feature_quantile_001,
                              feature_quantile_099,
                              feature_std,
                              feature_rsi,
                              feature_ppo,
                              feature_macd,
                              target)
In [20]:
feature_union
Out[20]:
PandasFeatureUnion(transformer_list=[('pipeline-1',
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcshift',
                                                       CalcShift(shift_val=1)),
                                                      ('calcreturn',
                                                       CalcReturn())])),
                                     ('pandasfeatureunion-1',
                                      PandasFeatureUnion(transformer_list=[('pipeline-1',
                                                                            Pipeline(steps=[('featureselector',
                                                                                             FeatureSelector(feature_names='price')),
                                                                                            ('calc...
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcppo', CalcPpo())])),
                                     ('pipeline-3',
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcmacd',
                                                       CalcMacd())])),
                                     ('pipeline-4',
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcshift',
                                                       CalcShift(shift_val=-13)),
                                                      ('calcreturn',
                                                       CalcReturn())]))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PandasFeatureUnion(transformer_list=[('pipeline-1',
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcshift',
                                                       CalcShift(shift_val=1)),
                                                      ('calcreturn',
                                                       CalcReturn())])),
                                     ('pandasfeatureunion-1',
                                      PandasFeatureUnion(transformer_list=[('pipeline-1',
                                                                            Pipeline(steps=[('featureselector',
                                                                                             FeatureSelector(feature_names='price')),
                                                                                            ('calc...
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcppo', CalcPpo())])),
                                     ('pipeline-3',
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcmacd',
                                                       CalcMacd())])),
                                     ('pipeline-4',
                                      Pipeline(steps=[('featureselector',
                                                       FeatureSelector(feature_names='price')),
                                                      ('calcshift',
                                                       CalcShift(shift_val=-13)),
                                                      ('calcreturn',
                                                       CalcReturn())]))])
FeatureSelector(feature_names='price')
CalcShift(shift_val=1)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=13)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=26)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=39)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=52)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=65)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=78)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=91)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=104)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=117)
CalcReturn()
FeatureSelector(feature_names='price')
CalcShift(shift_val=130)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=13)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=26)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=39)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=52)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=65)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=78)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=91)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=104)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=117)
CalcReturn()
FeatureSelector(feature_names='price')
CalcMa(period=130)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=13)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=26)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=39)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=52)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=65)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=78)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=91)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=104)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=117)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=130)
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=13, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=26, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=39, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=52, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=65, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=78, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=91, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=104, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=117, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcBB(period=130, type_='high')
CalcReturn()
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=13, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=26, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=39, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=52, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=65, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=78, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=91, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=104, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=117, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=130, quantile_val=0.01)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=13, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=26, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=39, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=52, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=65, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=78, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=91, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=104, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=117, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcQuantile(period=130, quantile_val=0.99)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=13)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=26)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=39)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=52)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=65)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=78)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=91)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=104)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=117)
FeatureSelector(feature_names='price')
CalcReturn()
CalcStd(period=130)
FeatureSelector(feature_names='price')
CalcRsi(period=13)
FeatureSelector(feature_names='price')
CalcRsi(period=26)
FeatureSelector(feature_names='price')
CalcRsi(period=39)
FeatureSelector(feature_names='price')
CalcRsi(period=52)
FeatureSelector(feature_names='price')
CalcRsi(period=65)
FeatureSelector(feature_names='price')
CalcRsi(period=78)
FeatureSelector(feature_names='price')
CalcRsi(period=91)
FeatureSelector(feature_names='price')
CalcRsi(period=104)
FeatureSelector(feature_names='price')
CalcRsi(period=117)
FeatureSelector(feature_names='price')
CalcRsi(period=130)
FeatureSelector(feature_names='price')
CalcPpo()
FeatureSelector(feature_names='price')
CalcMacd()
FeatureSelector(feature_names='price')
CalcShift(shift_val=-13)
CalcReturn()

Create Train-Test Data¶

In [21]:
# Build the full feature + target matrix from the raw tick data.
data = feature_union.fit_transform(price_execution_data_df)
In [22]:
# Rolling windows and shifts leave NaNs at the edges; drop those rows.
data = data.dropna()
train_size = 0.8
test_size = 1-train_size
data_size = data.shape[0]

# Chronological split (no shuffling) to avoid look-ahead leakage.
data_train = data.iloc[:int(data_size*train_size)]
data_test = data.iloc[int(data_size*train_size):]

# NOTE(review): assumes the target is the LAST column of the union output —
# confirm against FeatureEngineering's column ordering.
target_names = [f"returns_forward_{time_lag}"]
feature_names = data.columns[:-1]

X_train, y_train = data_train[feature_names], data_train[target_names]
X_test, y_test = data_test[feature_names], data_test[target_names]
In [23]:
X_train.shape
Out[23]:
(15895, 83)
In [24]:
X_test.shape
Out[24]:
(3974, 83)

Hyper Parameter Tuning¶

In [25]:
# Time Series Cross Validation
def time_series_cross_validation(X: pd.DataFrame,
                                 y: pd.DataFrame,
                                 init_train_size: int,
                                 val_cv: int,
                                 model: callable,
                                 params: Dict) -> Dict:
    """Walk-forward (expanding window) cross validation for time series.

    Parameters
    ----------
    X : feature data, ordered in time.
    y : target data aligned with X.
    init_train_size : number of leading rows in the first training fold.
    val_cv : size of each validation fold; the training window grows by
        this amount after every fold.
    model : regressor class (e.g. lgb.LGBMRegressor) instantiated once
        with `params` and re-fit on each fold.
    params : keyword arguments forwarded to `model`.

    Returns
    -------
    dict of per-fold lists: train/val IC, train/val RMSE, and the training
    sample size of each fold.
    """
    scores = {"train_ic": [],
              "val_ic": [],
              "sample_size": [],
              "train_rmse": [],
              "val_rmse": []}
    regressor = model(**params)
    for sample_size in range(init_train_size, X.shape[0], val_cv):
        X_train = X.iloc[:sample_size]
        y_train = y.iloc[:sample_size]
        X_val = X.iloc[sample_size:sample_size + val_cv]
        y_val = y.iloc[sample_size:sample_size + val_cv]
        regressor.fit(X_train, y_train)
        y_train_pred = regressor.predict(X_train)
        y_val_pred = regressor.predict(X_val)
        scores["train_ic"].append(ic_metric(y_train, y_train_pred))
        scores["val_ic"].append(ic_metric(y_val, y_val_pred))
        # sqrt so the "rmse" keys really hold RMSE; the original stored
        # plain MSE under these names.
        scores["train_rmse"].append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        scores["val_rmse"].append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
        scores["sample_size"].append(sample_size)
        # (removed dead `sample_size += val_cv`; range() already steps it)
    return scores

# Hyperopt objective: minimise the negative mean validation IC so that
# fmin effectively maximises the information coefficient.
def objective(params):
    cfg = params["cv_config"]
    scores = time_series_cross_validation(cfg.get("X"),
                                          cfg.get("y"),
                                          cfg.get("init_train_size"),
                                          cfg.get("val_cv"),
                                          cfg.get("model"),
                                          params["space"])
    return -np.mean(scores["val_ic"])
In [26]:
# CV Config
cv_config = {"X": X_train,
            "y": y_train,
            "init_train_size": 10000,
            "val_cv": 250,
            "model": lgb.LGBMRegressor}

# Uninformative Priors
# NOTE(review): scope.int(hp.uniform(...)) truncates a continuous sample;
# hp.quniform(..., q=1) is the conventional way to sample integers — confirm intent.
space = {
    'learning_rate':    hp.uniform('learning_rate', 0.01, 2),
    'num_leaves':       scope.int(hp.uniform('num_leaves', 2, 20)),
    'max_depth':        scope.int(hp.uniform('max_depth', 1, 20)),
    'subsample':        hp.uniform('subsample', 0.6, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1), #feature_fraction
    'num_iterations':   scope.int(hp.uniform('num_iterations',5, 400)),
    # 'reg_alpha':        hp.choice("reg_alpha", np.arange(0, 2, 0.1)), #L1 can shrink most of the features to zero, which will produce straight line pred
    'reg_lambda':        hp.uniform("reg_lambda",0, 5), #L2
    'seed': SEED
        }

params = {"cv_config":cv_config,
          "space": space}

# TPE search; `trials` records every evaluation for the analysis below.
trials = Trials()
best = fmin(fn=objective,
            space=params,
            algo=tpe.suggest,
            max_evals=250,
            trials=trials,
            rstate=np.random.default_rng(SEED))
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [39:09<00:00,  9.40s/trial, best loss: -0.03382063418707294]
In [27]:
best
Out[27]:
{'colsample_bytree': 0.9073245190249477,
 'learning_rate': 0.8362765112862323,
 'max_depth': 5.795458069323013,
 'num_iterations': 154.30736269988873,
 'num_leaves': 11.409479385608066,
 'reg_lambda': 2.8689492539762513,
 'subsample': 0.8105926533316665}

Tuning Result Visualisation¶

In [28]:
def unpack(x):
    """Return the first element of a hyperopt `vals` list, or NaN when empty."""
    return x[0] if x else np.nan

# Flatten hyperopt's trials into a tidy frame: one row per trial.
trials_df = pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack) for t in trials])
trials_df["loss"] = [t["result"]["loss"] for t in trials]
trials_df["trial_number"] = trials_df.index

# hyperopt records floats; cast the integer-valued params back to ints.
int_col_names = ["max_depth", "num_iterations","num_leaves"]
for col_name in int_col_names:
    trials_df[col_name] = trials_df[col_name].apply(lambda x: round(x))
trials_df["corr"] = trials_df["loss"].apply(lambda x: -x) # the hyperopt goal is to minimize -corr
In [29]:
trials_df.sort_values(by="corr", ascending=False)
Out[29]:
colsample_bytree learning_rate max_depth num_iterations num_leaves reg_lambda subsample loss trial_number corr
2 0.907325 0.836277 6 154 11 2.868949 0.810593 -0.033821 2 0.033821
56 0.827558 0.884786 7 344 16 2.498218 0.905005 -0.031934 56 0.031934
113 0.897715 0.827892 15 175 19 3.895779 0.995305 -0.026505 113 0.026505
89 0.732165 0.549636 19 268 14 0.883844 0.744865 -0.026131 89 0.026131
130 0.915108 1.372797 17 161 18 3.946307 0.905825 -0.024781 130 0.024781
... ... ... ... ... ... ... ... ... ... ...
39 0.818831 1.259677 17 49 14 3.664916 0.812888 0.021504 39 -0.021504
232 0.836154 0.710190 6 302 13 4.280225 0.844149 0.024907 232 -0.024907
181 0.905013 0.761663 12 262 11 3.727888 0.815731 0.026336 181 -0.026336
216 0.927756 0.826142 11 231 19 3.412954 0.719116 0.029589 216 -0.029589
45 0.600650 1.055966 18 118 7 2.182601 0.656977 0.033866 45 -0.033866

250 rows × 10 columns

In [30]:
# Save trials data
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
path = "/Users/efim/PycharmProjects/SimpleAlgoTrade/model/data/"
trials_df.to_csv(path+"light_gbm_trials.csv")

Trials vs Objective Func¶

In [31]:
px.scatter(trials_df, x="trial_number", y="corr")

Contour plot¶

Feature vs Objective Func

Num Leaves vs Num Iteration

In [32]:
# Contour of validation IC ("corr") over num_iterations x num_leaves.
fig = go.Figure(
    data=go.Contour(
        z=trials_df["corr"],
        x=trials_df["num_iterations"],
        y=trials_df["num_leaves"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        # NOTE(review): colorbar `titleside` is deprecated in newer plotly — confirm version.
        colorbar=dict(title="corr", titleside="right",),
        hovertemplate="corr: %{z}<br>num_iterations: %{x}<br>num_leaves: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="num_iterations",
    yaxis_title="num_leaves",
    title={
        "text": "num_iterations vs. num_leaves",
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
    margin=dict(l=10, r=10, t=10, b=10)
)

learning_rate vs num_iterations

In [33]:
# Contour of validation IC ("corr") over num_iterations x learning_rate.
fig = go.Figure(
    data=go.Contour(
        z=trials_df["corr"],
        x=trials_df["num_iterations"],
        y=trials_df["learning_rate"],
        contours=dict(
            showlabels=True,  # show labels on contours
            labelfont=dict(size=12, color="white",),  # label font properties
        ),
        colorbar=dict(title="corr", titleside="right",),
        hovertemplate="corr: %{z}<br>num_iterations: %{x}<br>learning_rate: %{y}<extra></extra>",
    )
)

fig.update_layout(
    xaxis_title="num_iterations",
    yaxis_title="learning_rate",
    title={
        "text": "num_iterations vs. learning_rate",
        "xanchor": "center",
        "yanchor": "top",
        "x": 0.5,
    },
    margin=dict(l=10, r=10, t=10, b=10)
)

Re-Train the Model with best Params¶

In [34]:
def map_to_int(params: Dict, int_keys=("max_depth", "num_iterations", "num_leaves")) -> Dict:
    """Round the integer-valued hyper parameters returned by hyperopt.

    `hp.uniform` samples floats even for parameters LightGBM expects as
    ints, so those entries are rounded here.

    Parameters
    ----------
    params : raw best-parameter dict from `fmin`.
    int_keys : names of the parameters that must be integers; the default
        matches the module-level `int_col_names` so existing calls behave
        identically.
    """
    params_mapped = {}
    for key, val in params.items():
        params_mapped[key] = round(val) if key in int_keys else val
    return params_mapped
        
# Convert fmin's float-valued result into LightGBM-ready parameters.
best_params = map_to_int(best)
print(best_params)
{'colsample_bytree': 0.9073245190249477, 'learning_rate': 0.8362765112862323, 'max_depth': 6, 'num_iterations': 154, 'num_leaves': 11, 'reg_lambda': 2.8689492539762513, 'subsample': 0.8105926533316665}
In [35]:
# Re-run the walk-forward CV with the tuned parameters to get a learning curve.
scores =  time_series_cross_validation(X_train,
                                       y_train,
                                       cv_config["init_train_size"],
                                       cv_config["val_cv"],
                                       cv_config["model"],
                                       best_params)
In [36]:
# Train vs validation IC per expanding-window fold.
plt.plot(scores["train_ic"], label="Train IC")
plt.plot(scores["val_ic"], label="Val IC")
plt.legend()
Out[36]:
<matplotlib.legend.Legend at 0x7fdfb2ef3610>
In [37]:
np.mean(scores["val_ic"])
Out[37]:
0.009715709631665424
In [38]:
# Final model: fit once on the full training window with tuned parameters.
model = lgb.LGBMRegressor(**best_params)
model.fit(X_train, y_train)
Out[38]:
LGBMRegressor(colsample_bytree=0.9073245190249477,
              learning_rate=0.8362765112862323, max_depth=6, num_iterations=154,
              num_leaves=11, reg_lambda=2.8689492539762513,
              subsample=0.8105926533316665)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMRegressor(colsample_bytree=0.9073245190249477,
              learning_rate=0.8362765112862323, max_depth=6, num_iterations=154,
              num_leaves=11, reg_lambda=2.8689492539762513,
              subsample=0.8105926533316665)
In [39]:
top = 50
# Importances sorted descending; sort_values keeps the ORIGINAL integer
# index, so any positional access on df_feat must use .iloc, not [].
df_feat = (
    pd.DataFrame({"feat_importance": model.feature_importances_,
                  "feature_name": X_train.columns})
    .sort_values(by="feat_importance", ascending=False)
          )
df_feat.head(top).plot.bar(x="feature_name", y ="feat_importance")
plt.title(f"Top {top} feature importance")  # fixed typo in title ("imporance")
Out[39]:
Text(0.5, 1.0, 'Top 50 feature imporance')

Hyper Parameter Significance¶

In [40]:
# Regress realised IC on the hyper parameters to gauge which ones matter.
ols = OLS(endog=trials_df['corr'], exog=add_constant(trials_df.drop(['loss','trial_number','corr'], axis=1))).fit()
print(ols.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   corr   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     1.636
Date:                Fri, 16 Sep 2022   Prob (F-statistic):              0.126
Time:                        11:27:17   Log-Likelihood:                 768.12
No. Observations:                 250   AIC:                            -1520.
Df Residuals:                     242   BIC:                            -1492.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -0.0052      0.009     -0.584      0.560      -0.023       0.012
colsample_bytree    -0.0066      0.008     -0.841      0.401      -0.022       0.009
learning_rate       -0.0032      0.002     -1.805      0.072      -0.007       0.000
max_depth         9.801e-05      0.000      0.699      0.485      -0.000       0.000
num_iterations   -1.569e-06   7.91e-06     -0.198      0.843   -1.72e-05     1.4e-05
num_leaves          -0.0001      0.000     -0.613      0.540      -0.000       0.000
reg_lambda           0.0005      0.001      0.899      0.369      -0.001       0.002
subsample            0.0192      0.007      2.602      0.010       0.005       0.034
==============================================================================
Omnibus:                        1.314   Durbin-Watson:                   1.965
Prob(Omnibus):                  0.518   Jarque-Bera (JB):                1.024
Skew:                          -0.132   Prob(JB):                        0.599
Kurtosis:                       3.170   Cond. No.                     3.60e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.6e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [41]:
# Coefficients with their 95% confidence intervals in one frame.
pd.concat([ols.params.to_frame('coef'),
           ols.conf_int().rename({0:"lower",1:"upper"},axis=1),
          ], axis=1)
Out[41]:
coef lower upper
const -0.005240 -0.022913 0.012432
colsample_bytree -0.006641 -0.022202 0.008920
learning_rate -0.003170 -0.006630 0.000289
max_depth 0.000098 -0.000178 0.000374
num_iterations -0.000002 -0.000017 0.000014
num_leaves -0.000106 -0.000445 0.000234
reg_lambda 0.000526 -0.000627 0.001680
subsample 0.019159 0.004653 0.033665

Model Interpretation: Partial Dependence¶

Note: Correlations between features are ignored

In [42]:
num_feat = 10
# One subplot per feature, sized by num_feat instead of a hard-coded 10.
fig, ax = plt.subplots(num_feat, 1, figsize=(10, 90))
for i in range(num_feat):
    # Use positional access: after sort_values, df_feat keeps its original
    # integer index, so `df_feat["feature_name"][i]` was a LABEL lookup and
    # plotted the i-th feature in the original (unsorted) order, not the
    # i-th most important one.
    feat = df_feat["feature_name"].iloc[i]
    PartialDependenceDisplay.from_estimator(model, X_train, [feat], kind='both', ax=ax[i])

Model Prediction Returns¶

In [43]:
# Predicted vs actual forward returns on the training window.
plt.plot(y_train.values, label = "actual")
plt.plot(model.predict(X_train), label ="pred")
plt.title("Train Set")
plt.legend()
Out[43]:
<matplotlib.legend.Legend at 0x7fdfa49b49a0>
In [44]:
# Predicted vs actual forward returns on the held-out test window.
plt.plot(y_test.values, label = "actual")
plt.plot(model.predict(X_test), label ="pred")
plt.title("Test Set")
plt.legend()
Out[44]:
<matplotlib.legend.Legend at 0x7fdfb4a71d50>

Model Prediction Prices¶

In-Sample Data¶

In [45]:
# Reconstruct predicted PRICES from predicted returns on the train window.
y_train_pred = pd.Series(model.predict(X_train), index = X_train.index, name=f"train_return_pred_{time_lag}")
price_execution_data_df[f"price_shift_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag)
price_execution_data_df[f"price_pct_change_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag).pct_change()
# Base price from which a predicted return is turned into a predicted price.
price_execution_data_df["multiplicator"] = price_execution_data_df[f"price_shift_{time_lag}"].shift(1)
# NOTE(review): merging on a float time index — duplicate timestamps would fan out rows; verify uniqueness.
full_data_train = price_execution_data_df.merge(y_train, left_index=True, right_index=True)
full_data_train = full_data_train.merge(y_train_pred, left_index=True, right_index=True)
full_data_train.head(10)
Out[45]:
symbol price delta_time price_shift_13 price_pct_change_13 multiplicator returns_forward_13 train_return_pred_13
time
1.660649e+09 BTCUSDT 24078.99 0.613602 24078.93 0.000155 24075.19 0.000155 0.000026
1.660649e+09 BTCUSDT 24080.96 0.613479 24077.98 -0.000039 24078.93 -0.000039 -0.000017
1.660649e+09 BTCUSDT 24079.51 1.428748 24078.31 0.000014 24077.98 0.000014 -0.000016
1.660649e+09 BTCUSDT 24078.57 0.614810 24078.82 0.000021 24078.31 0.000021 -0.000064
1.660649e+09 BTCUSDT 24079.09 0.613618 24078.31 -0.000021 24078.82 -0.000021 -0.000039
1.660649e+09 BTCUSDT 24079.84 0.613721 24078.74 0.000018 24078.31 0.000018 0.000016
1.660649e+09 BTCUSDT 24078.14 0.613349 24076.06 -0.000111 24078.74 -0.000111 -0.000043
1.660649e+09 BTCUSDT 24078.53 1.432766 24074.90 -0.000048 24076.06 -0.000048 -0.000018
1.660649e+09 BTCUSDT 24075.34 1.334504 24078.35 0.000143 24074.90 0.000143 0.000011
1.660649e+09 BTCUSDT 24073.91 0.612562 24078.91 0.000023 24078.35 0.000023 -0.000031
In [46]:
full_data_train.shape
Out[46]:
(15895, 8)
In [47]:
# Use the ABSOLUTE error: without .abs() the original check also passed when
# the reconstructed price was arbitrarily far below the shifted price.
assert ((full_data_train["multiplicator"]*(1+full_data_train[f"returns_forward_{time_lag}"]) - full_data_train[f"price_shift_{time_lag}"]).abs() < 1e-6).all(), "Oops Prices are not aligned"
In [48]:
# Predicted price = base price x (1 + predicted forward return).
full_data_train[f"train_price_pred_{time_lag}"]  = full_data_train["multiplicator"]*(1+full_data_train[f"train_return_pred_{time_lag}"])
In [49]:
# Predicted vs actual prices for the first 100 training ticks.
full_data_train[f"train_price_pred_{time_lag}"].iloc[:100].plot(label="y_pred")
full_data_train[f"price_shift_{time_lag}"].iloc[:100].plot(label="y_true")
plt.legend()
Out[49]:
<matplotlib.legend.Legend at 0x7fdfa51fdfc0>

Out-Sample Data¶

In [50]:
# Reconstruct predicted PRICES from predicted returns on the test window.
y_test_pred = pd.Series(model.predict(X_test), index = X_test.index, name=f"test_return_pred_{time_lag}")
price_execution_data_df[f"price_shift_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag)
price_execution_data_df[f"price_pct_change_{time_lag}"] = price_execution_data_df["price"].shift(-time_lag).pct_change()
# Base price from which a predicted return is turned into a predicted price.
price_execution_data_df["multiplicator"] = price_execution_data_df[f"price_shift_{time_lag}"].shift(1)
full_data_test = price_execution_data_df.merge(y_test, left_index=True, right_index=True)
full_data_test = full_data_test.merge(y_test_pred, left_index=True, right_index=True)
# The tail of the test window has no forward prices; drop those rows.
full_data_test.dropna(inplace=True)
full_data_test.head(10)
Out[50]:
symbol price delta_time price_shift_13 price_pct_change_13 multiplicator returns_forward_13 test_return_pred_13
time
1.660661e+09 BTCUSDT 23757.88 0.656786 23763.33 0.000023 23762.78 0.000023 0.000015
1.660661e+09 BTCUSDT 23756.04 0.613033 23764.46 0.000048 23763.33 0.000048 -0.000004
1.660661e+09 BTCUSDT 23759.18 0.612939 23762.64 -0.000077 23764.46 -0.000077 0.000031
1.660661e+09 BTCUSDT 23759.27 0.612498 23757.75 -0.000206 23762.64 -0.000206 0.000030
1.660661e+09 BTCUSDT 23759.75 1.330198 23758.22 0.000020 23757.75 0.000020 0.000073
1.660661e+09 BTCUSDT 23760.90 0.715904 23759.08 0.000036 23758.22 0.000036 0.000044
1.660661e+09 BTCUSDT 23765.09 0.614189 23760.25 0.000049 23759.08 0.000049 -0.000013
1.660661e+09 BTCUSDT 23766.28 0.612893 23759.44 -0.000034 23760.25 -0.000034 -0.000024
1.660661e+09 BTCUSDT 23767.16 0.576914 23757.04 -0.000101 23759.44 -0.000101 0.000011
1.660661e+09 BTCUSDT 23765.95 0.650788 23756.31 -0.000031 23757.04 -0.000031 -0.000006
In [51]:
# Use the ABSOLUTE error: without .abs() the original check also passed when
# the reconstructed price was arbitrarily far below the shifted price.
assert ((full_data_test["multiplicator"]*(1+full_data_test[f"returns_forward_{time_lag}"]) - full_data_test[f"price_shift_{time_lag}"]).abs() < 1e-6).all(), "Oops Prices are not aligned"
In [52]:
# Predicted price = base price x (1 + predicted forward return).
full_data_test[f"test_price_pred_{time_lag}"]  = full_data_test["multiplicator"]*(1+full_data_test[f"test_return_pred_{time_lag}"])
In [53]:
# Predicted vs actual prices for the first 100 test ticks.
full_data_test[f"test_price_pred_{time_lag}"].iloc[:100].plot(label="y_pred")
full_data_test[f"price_shift_{time_lag}"].iloc[:100].plot(label="y_true")
plt.legend()
Out[53]:
<matplotlib.legend.Legend at 0x7fdfa522c8e0>

Save Prediction¶

In [55]:
# # Train Data
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
path = "/Users/efim/PycharmProjects/SimpleAlgoTrade/model/data/"
# full_data_train[["price","price_shift_13","train_price_pred_13"]].to_csv(path+"train_pred.csv")

# # Test Data
# full_data_test[["price","price_shift_13","test_price_pred_13"]].to_csv(path+"test_pred.csv")

# Persist predicted returns alongside the contemporaneous price.
y_train_pred = pd.merge(pd.Series(model.predict(X_train), index = X_train.index, name="train_return_pred_13"),
                        price_execution_data_df["price"],
                        left_index=True,
                        right_index=True)

y_test_pred = pd.merge(pd.Series(model.predict(X_test), index = X_test.index, name="test_return_pred_13"),
                       price_execution_data_df["price"],
                       left_index=True,
                       right_index=True)

y_train_pred.to_csv(path+"train_pred.csv")
y_test_pred.to_csv(path+"test_pred.csv")
In [ ]: